• DOMAIN: Automobile
• CONTEXT: The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes
• DATA DESCRIPTION: The data concerns city-cycle fuel consumption in miles per gallon
Attribute Information:
mpg: continuous
cylinders: multi-valued discrete
displacement: continuous
horsepower: continuous
weight: continuous
acceleration: continuous
model year: multi-valued discrete
origin: multi-valued discrete
car name: string (unique for each instance)
• PROJECT OBJECTIVE: Goal is to cluster the data and treat them as individual datasets to train Regression models to predict ‘mpg’
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, recall_score
from sklearn.metrics import precision_score, f1_score
from scipy.stats import zscore
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from scipy.spatial.distance import cdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist #Pairwise distribution between data points
# Load the two halves of the dataset: car names (CSV) and car attributes (JSON).
car_name = pd.read_csv('Part1 - Car name.csv')
print('car_name Dataframe shape is : {}'.format(car_name.shape))
car_attributes = pd.read_json("Part1 - Car-Attributes.json")
print("car_attributes json dataframe shape is : {}".format(car_attributes.shape))
# Column dtypes and non-null counts for the names frame.
car_name.info()
1) Total number of rows in car_name dataset are 398
2) Total number of columns in car_name dataset are 1
3) 1 column is with Object dataype
4) car_name column in the car_name dataframe is non-null
car_attributes.info()
1) Total number of rows in car_attributes dataset are 398
2) Total number of columns in car_attributes dataset are 8
3) 3 columns are with float64 datatype
4) 4 columns are with int64 datatype
5) 1 column is with Object dataype
6) all the columns are non-null in car_attributes
# Positional (index) join: both frames have 398 rows in the same order,
# so this pairs each car name with its attribute row.
car_data = car_name.join(car_attributes)
print("car_data Merged dataframe shape is : {}".format(car_data.shape))
car_data.info()
1) Total number of rows in car_data dataset are 398
2) Total number of columns in car_data dataset are 9
3) 3 columns are with float64 datatype
4) 4 columns are with int64 datatype
5) 2 columns are with Object datatype now, i.e. car_name and hp
6) all the columns are non-null in car_data
# Export to CSV, XLSX and JSON on the local machine for future use.
car_data.to_csv("car_data.csv")
car_data.to_excel("car_data.xlsx")
car_data.to_json("car_data.json")
# In-memory backup copy, also for future use.
car_data_backup = car_data.copy()
car_data.head()
Have already imported the datasets and merged them into a single dataset, i.e. car_data, which has also been exported to the local machine in CSV, XLSX and JSON for future use
# Checking the presence of any null values (per-column counts).
car_data.isnull().sum()
# Different method to check the null values; result dtype is bool per column.
car_data.isnull().any()
car_data.head()
# isdigit() on 'hp': True where the string is made only of digits,
# False otherwise — flags non-numeric placeholders such as '?'.
hpIsDigit = pd.DataFrame(car_data.hp.str.isdigit())
# Show only the rows where hp is not a plain number (isdigit == False).
car_data[hpIsDigit['hp'] == False]
# Replace the missing-value marker '?' with NaN across the whole frame.
car_data = car_data.replace('?', np.nan)
# Re-inspect the same rows: the '?' entries are now NaN.
car_data[hpIsDigit['hp'] == False]
There are various ways to handle missing values. Drop the rows, replace missing values with median values etc. of the 398 rows 6 have NAN in the hp column. We could drop those 6 rows - which might not be a good idea under all situations
# Instead of dropping the rows, replace the missing values with the median.
car_data.describe(include='all').T
# Keep a pre-imputation copy so before/after stats can be compared below.
car_data_1 = car_data.copy()
# BUG FIX: after replacing '?' with NaN, 'hp' is still an object (string)
# column, so .median() on it is unreliable/raises. Convert to float first
# (this line was commented out in the original), then impute the median.
car_data['hp'] = car_data['hp'].astype('float64')
car_data['hp'] = car_data['hp'].fillna(car_data['hp'].median())
print('Descriptive Stats before imputation for columns with missing values: \n', '--'*33)
display(car_data_1.describe(include='all').T)
print()
print('Descriptive Stats after imputation for columns with missing values: \n', '--'*33)
display(car_data.describe(include='all').T)
del car_data_1
An observation after imputing the missing values: medians and means remain unchanged after the imputation. The type of skewness also remains unchanged.
# Checking the presence of any null values after missing value treatment.
car_data.isnull().sum()
# Cardinality of the three discrete attributes.
print("Unique values for Cylinder:", car_data['cyl'].unique())
print("Unique values for Year:", car_data['yr'].unique())
print("Unique values for Origin:", car_data['origin'].unique())
# Correlation heatmap of the numeric columns.
fig = plt.figure(figsize = (10,8))
sns.heatmap(car_data.corr(),annot=True,linewidths=.05);
# Function for displaying the correlation-matrix pairs whose absolute
# correlation falls inside [Lower_threshold, Upper_threshold).
def correlation_matrix(df, Lower_threshold = 0.8, Upper_threshold = 1.0):
    """Display variable pairs of ``df`` sorted by |correlation|, keeping
    only those with Lower_threshold <= |corr| < Upper_threshold."""
    corr = df.corr()
    # One value per pair: unstack the matrix; drop_duplicates removes the
    # mirrored (b, a) entries and collapses the identical 1.0 self-correlations.
    sort = corr.abs().unstack().drop_duplicates()
    sort = sort.sort_values(kind = "quicksort", ascending = False)
    # FIX: strict '<' upper bound so the residual self-correlation entry
    # (exactly 1.0) is excluded; this also matches the second definition of
    # this helper used later in the file.
    display(sort[(sort >= Lower_threshold) & (sort < Upper_threshold)])

print("Correlation between the pairs whose correlation is more than .90")
correlation_matrix(car_data, Lower_threshold = 0.9, Upper_threshold =1.0)
print("Correlation between the pairs whose correlation is beween .80 and .90")
correlation_matrix(car_data, Lower_threshold = 0.8, Upper_threshold =.9)
print("Correlation between the pairs whose correlation is beween .70 and .80")
correlation_matrix(car_data, Lower_threshold = 0.7, Upper_threshold =.8)
1) cyl and disp, disp and wt, disp and hp are correlated with each other with a correlation coeff greater than equal to 0.9.
2) cyl, disp and wt are independent variables and are highly correlated with each other, but let's not drop them now because there are very few columns and there could be some dependency
3) car_name can also be dropped from the dataset as it is not going to serve any purpose during modelling and EDA.
# Drop car_name: unique per row, so it carries no signal for EDA/modelling.
car_data = car_data.drop(['car_name'], axis=1)
car_data.head()
# One-hot encode the 'origin' category into origin_1/origin_2/origin_3.
car_data_dummy = pd.get_dummies(car_data, columns=['origin'])
car_data_dummy.head()
# Drop one dummy to avoid the dummy-variable trap (perfect collinearity).
car_data_dummy = car_data_dummy.drop(['origin_3'],axis=1)
# Five-point summary of the remaining columns.
car_data.describe(include='all').T
Positively skewed: Most frequent values are low and tail is towards high values.
Negatively skewed: Most frequent values are high and tail is towards low values.
If Mode< Median< Mean then the distribution is positively skewed.
If Mode> Median> Mean then the distribution is negatively skewed.
mpg: Range of Q1 to Q3 is between 17.5 to 29. Right skewed
hp: Range of Q1 to Q3 is 76 to 125. Right skewed
acc: Range of Q1 to Q3 is 13.8 to 17.2. Normally distributed
yr: Range of Q1 to Q3 is 73 to 79. Normally distributed
disp: Range of Q1 to Q3 is 104.2 to 262.000. Right skewed
wt: Range of Q1 to Q3 is 2223.750 to 3608.000. Right skewed
# Check for skewness of the columns if there is any.
car_data.skew()
# Distribution of each numeric column.
car_data.hist(figsize=(15,10));
# Checking the correlation between the variables after the column drops.
fig = plt.figure(figsize = (7,5))
sns.heatmap(car_data.corr(),annot=True,linewidths=.05);
# Pairplot (scatter + KDE diagonals) of car_data after dropping the columns.
sns.pairplot(car_data,diag_kind='kde');
1) miles per gallon is less when horse power is more and vice versa
2) miles per gallon is less when no of cylinders are more and and vice versa
3) miles per gallon is less when displacement is more and and vice versa
4) miles per gallon is less when weight is more and and vice versa
# Boxplot of every variable against the number of cylinders.
fig = plt.figure(figsize = (16,20))
i=1
for col in car_data.columns:
    plt.subplot(3, 3, i)
    plt.title("Boxplot for \n No of cylinder vs {}".format(col))
    sns.boxplot(data = car_data , y=col, x='cyl')
    i=i+1
plt.show()
# Frequency of each cylinder count (3- and 5-cylinder cars are rare).
print(car_data['cyl'].value_counts())
car_data[car_data['cyl'] == 5]
# Origin breakdown for each cylinder count.
print('origin and counts of cars when cyl are 3 : \n{}\n'.format(car_data[car_data['cyl'] == 3]['origin'].value_counts()))
print('origin and counts of cars when cyl are 4 : \n{}\n'.format(car_data[car_data['cyl'] == 4]['origin'].value_counts()))
print('origin and counts of cars when cyl are 5 : \n{}\n'.format(car_data[car_data['cyl'] == 5]['origin'].value_counts()))
print('origin and counts of cars when cyl are 6 : \n{}\n'.format(car_data[car_data['cyl'] == 6]['origin'].value_counts()))
print('origin and counts of cars when cyl are 8 : \n{}\n'.format(car_data[car_data['cyl'] == 8]['origin'].value_counts()))
# Swarmplot of every variable against the number of cylinders.
fig = plt.figure(figsize = (16,20))
i=1
for col in car_data.columns:
    plt.subplot(3, 3, i)
    plt.title("swarmplot for \n No of cylinder vs {}".format(col))
    sns.swarmplot(data = car_data , y=col, x='cyl')
    i=i+1
plt.show()
car_data['cyl'].value_counts()
# NOTE(review): the positional argument is deprecated in newer seaborn;
# prefer sns.countplot(x='cyl', data=car_data) when upgrading.
sns.countplot(car_data['cyl'])
# Boxplot of every variable against origin.
fig = plt.figure(figsize = (16,15))
i=1
for col in car_data.columns:
    plt.subplot(3, 3, i)
    plt.title("Boxplot for origin vs {}".format(col))
    sns.boxplot(data = car_data , y=col, x='origin')
    i=i+1
plt.show()
# Cylinder breakdown for each origin.
print('Cyl and counts of cars when origin is 1 : \n{}\n'.format(car_data[car_data['origin'] == 1]['cyl'].value_counts()))
print('Cyl and counts of cars when origin is 2 : \n{}\n'.format(car_data[car_data['origin'] == 2]['cyl'].value_counts()))
print('Cyl and counts of cars when origin is 3 : \n{}\n'.format(car_data[car_data['origin'] == 3]['cyl'].value_counts()))
# Scatter plots of mpg vs hp: plain, then hued by cyl and by origin.
# NOTE(review): subplots 1 and 2 draw the identical mpg-vs-hp scatter;
# presumably a different variable pair was intended for one of them — confirm.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.scatterplot(car_data["mpg"],car_data["hp"])
plt.subplot(2, 2, 2)
sns.scatterplot(car_data["mpg"],car_data["hp"])
plt.subplot(2, 2, 3)
sns.scatterplot(car_data["mpg"],car_data["hp"],hue=car_data["cyl"],palette='Set1')
plt.subplot(2, 2, 4)
sns.scatterplot(car_data["mpg"],car_data["hp"],hue=car_data["origin"],palette='Set1')
plt.show()
1) High Horsepower and high cylinders and low Miles per gallon. cars with cylinder 8 which have more horse power have less miles per gallon. cars with less cylinders i.e. have more miles per gallon.
2) Origin 1 cars are more distributed
3) cars from Origin 2 and 3 have high miles per gallon and less horsepower in comparison to cars from origin 1
# Countplots of model year and cylinders, split by origin.
fig = plt.figure(figsize = (15,5))
plt.subplot(1, 2, 1)
sns.countplot(car_data["yr"],hue=car_data["origin"])
plt.subplot(1, 2, 2)
sns.countplot(car_data["cyl"],hue=car_data["origin"])
plt.show()
# K-means clustering.
# Scale the data: z-score every column so Euclidean distances are comparable.
car_data_scaled = car_data_dummy.apply(zscore)
# Finding optimal no. of clusters via the elbow method: plot the mean
# distance-to-nearest-centroid (distortion) for k = 1..9.
clusters=range(1,10)
meanDistortions=[]
for k in clusters:
    model=KMeans(n_clusters=k)
    model.fit(car_data_scaled)
    prediction=model.predict(car_data_scaled)
    # Average distance of each point to its nearest cluster centre.
    meanDistortions.append(sum(np.min(cdist(car_data_scaled, model.cluster_centers_, 'euclidean'),
                                      axis=1)) / car_data_scaled.shape[0])
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# Final K-means model with k=3 (chosen from the elbow plot); tag every
# frame with the resulting cluster label.
car_data_Kmeans3 = KMeans(3)
car_data_Kmeans3.fit(car_data_scaled)
# Predict once and reuse the labels: the original called predict() three
# times on the same data, rescanning all rows for identical results.
kmeans3_labels = car_data_Kmeans3.predict(car_data_scaled)
car_data['group'] = kmeans3_labels
car_data_dummy['group'] = kmeans3_labels
car_data_scaled['group'] = kmeans3_labels
# Per-cluster views of the raw, dummy-encoded and scaled frames.
car_data_groups = car_data.groupby(['group'])
car_data_dummy_groups = car_data_dummy.groupby(['group'])
car_data_scaled_groups = car_data_scaled.groupby(['group'])
car_data_scaled_groups.mean()
car_data_scaled.groupby(["group"]).count()
car_data_scaled_groups.boxplot(layout = (2,2),figsize=(16,10))
car_data_scaled_groups.mean()
Miles per gallon is Average
Number of cylinders is Average
Displacement is Average
Horsepower is Average
Weight is Average
Acceleration is High
Year of model is in between
Miles per gallon is High
Number of cylinders is very low
Displacement is low
Horsepower is Low
Weight is low
Acceleration is Average
Year of model is high
One origin has the majority
Miles per gallon is Low
Number of cylinders is High
Displacement is High
Horsepower is High
Weight is High
Acceleration is Low
Year of model is low
# Agglomerative (hierarchical) clustering on the scaled features only —
# drop the K-means 'group' column added above (last column).
car_data_scaled_hierarchical = car_data_scaled.iloc[:,:-1]
Z = linkage(car_data_scaled_hierarchical, metric='euclidean', method='average')
# Cophenetic correlation: how faithfully the dendrogram preserves the
# original pairwise distances (closer to 1 is better).
c, coph_dists = cophenet(Z , pdist(car_data_scaled_hierarchical))
c
plt.figure(figsize=(15, 5))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90.,color_threshold = 40, leaf_font_size=8.)
plt.tight_layout()
# Cut the tree into 6 clusters and label all three frames.
car_data_hier6_model = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='average')
car_data_hier6_model.fit(car_data_scaled_hierarchical)
car_data_scaled['group6'] = car_data_hier6_model.labels_
car_data['group6'] = car_data_hier6_model.labels_
car_data_dummy['group6'] = car_data_hier6_model.labels_
car_data_scaled.groupby(["group6"]).count()
car_data_scaled_group6 = car_data_scaled.groupby(["group6"])
# car_data_dummy_group6 = car_data_dummy.groupby(["group6"])
# FIX: group the unscaled frame here — the original grouped car_data_scaled
# again (copy-paste), making car_data_group6 a duplicate of the line above.
car_data_group6 = car_data.groupby(["group6"])
car_data_scaled_group6.boxplot(layout = (2,3), figsize=(16,10))
No of Clusters = 3
car_data_dummy.head()
# Linear regression model — a single instance shared by all cluster fits below.
regression_model = LinearRegression()
def fit_linear_model(X, y, result):
    """Fit the shared LinearRegression on a 70/30 split of (X, y) and
    append train/test R^2 scores, per-feature coefficients and the
    intercept to the `result` accumulator dict. Returns `result`."""
    # Hold out 30% for testing; fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
    regression_model.fit(X_train, y_train)
    result['TrainData_score'].append(regression_model.score(X_train, y_train))
    result['TestData_score'].append(regression_model.score(X_test, y_test))
    # One '<column>_co-eff' list per feature, created lazily on first use.
    for idx, col_name in enumerate(X_train.columns):
        result.setdefault('%s_co-eff' % col_name, []).append(regression_model.coef_[idx])
    result['intercept'].append(regression_model.intercept_)
    return result
# Accumulator: one row per dataset (cluster 0/1/2 or 'all').
result = {'Dataset':[], 'TrainData_score':[], 'TestData_score':[], 'intercept':[]}
# Based on clusters, let's try to fit a linear model per cluster, plus one on
# the full scaled dataset for comparison.
for cluster_index in [0,1,2,'all']:
    # Get the cluster dataset using cluster index/group.
    result['Dataset'].append(cluster_index)
    if(cluster_index == 'all'):
        car_data_dummy_group = car_data_scaled
    else:
        car_data_dummy_group = car_data_scaled[car_data_scaled['group'] == cluster_index]
    # Define X and y: drop the target and both cluster-label columns.
    X = car_data_dummy_group.drop(['mpg', 'group','group6'], axis=1)
    y = car_data_dummy_group['mpg']
    result = fit_linear_model(X, y, result)
pd.DataFrame(result)
From the above dataframe, we can see that when no clusters are used and a single Linear model is created, the accuracy is pretty good, i.e. 84%
Accuracy score when a Linear model is created for Cluster 0 is 79%
Accuracy score when a Linear model is created for Cluster 1 is 71%
Accuracy score when a Linear model is created for Cluster 2 is 69%
Performance of models with clustering is usually robust, but in our case we have very little data, and that could be the reason accuracy without clustering is better
1) More data should be captured so that training and testing for model creation can be accurate. Training with more data can give better accuracy for MPG
2) It would be good to have information on the number of persons driving the car. A car driven by a single person usually has better mileage than a car driven by 2-3 persons, because each person's driving style is different.
====================================================================================================================
• DOMAIN: Manufacturing
• CONTEXT: Company X curates and packages wine across various vineyards spread throughout the country
• DATA DESCRIPTION: The data concerns the chemical composition of the wine and its respective quality
Attribute Information:
A, B, C, D: specific chemical composition measure of the wine
Quality: quality of wine
• PROJECT OBJECTIVE: Goal is to build a synthetic data generation model using the existing data provided by the company
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, recall_score
from sklearn.metrics import precision_score, f1_score
from scipy.stats import zscore
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# Load the wine composition data.
wine_data = pd.read_excel("Part2 - Company.xlsx")
print('wine_data Dataframe shape is : {}'.format(wine_data.shape))
wine_data.info()
wine_data.head()
# Checking the presence of any null values.
wine_data.isnull().sum()
wine_data['Quality'].unique()
# Rows whose Quality label is missing.
wine_data[wine_data['Quality'].isnull()]
wine_data.describe(include='all').T
wine_data['Quality'].value_counts()
Let's replace the NaN values with 'Quality NAN'
# Tag the unlabelled rows explicitly so they survive value_counts/filters.
wine_data["Quality"] = wine_data["Quality"].fillna('Quality NAN')
wine_data['Quality'].value_counts()
# Feature matrix X (chemical measures A-D) and label vector y.
wine_data_X = wine_data.copy()
wine_data_X = wine_data_X.drop('Quality',axis=1)
wine_data_y = wine_data['Quality']
wine_data_X
# Apply zscore and scale all the X attributes using standard mean=0 and st. deviation=1.
wine_data_X_Scaled=wine_data_X.apply(zscore)
wine_data_X_Scaled.head()
fig = plt.figure(figsize = (6,4))
sns.heatmap(wine_data.corr(),annot=True,linewidths=.05);
sns.pairplot(wine_data, diag_kind='kde');
# K-means with K=2: two quality grades (A and B) are expected.
wine_data_model = KMeans(2)
wine_data_model.fit(wine_data_X_Scaled)
wine_quality_prediction = wine_data_model.predict(wine_data_X_Scaled)
wine_data['Predicted_Quality'] = wine_quality_prediction
wine_data.head()
# Predicted cluster for the rows whose true label was missing.
wine_data[wine_data['Quality'] == 'Quality NAN']
# Calculate accuracy of K-means by checking 'Quality' == 'Predicted_Quality'.
def compare_predicted_quality(row):
    """Return True when the predicted cluster agrees with the row's label.

    Cluster 1 is taken as 'Quality A' and cluster 0 as 'Quality B'.
    Rows whose label was missing ('Quality NAN') are counted as correct for
    either cluster, since there is no ground truth to contradict.
    """
    if(row['Quality'] == "Quality A" and row['Predicted_Quality'] == 1):
        return True
    elif(row['Quality'] == "Quality B" and row['Predicted_Quality'] == 0):
        return True
    elif(row['Quality'] == "Quality NAN" and row['Predicted_Quality'] == 0):
        return True
    elif(row['Quality'] == "Quality NAN" and row['Predicted_Quality'] == 1):
        return True
    return False

def calculate_accuracy(df):
    """Return the percentage of rows whose prediction matches the label.

    BUG FIX: the original looped `for item in comp.items()`, which yields
    (index, value) tuples; a non-empty tuple is always truthy, so every row
    was counted as correct and the reported accuracy was always 100%.
    Summing the boolean Series counts only the True rows.
    """
    comp = df.apply(lambda row : compare_predicted_quality(row), axis=1)
    correct_predicted = comp.sum()
    return correct_predicted*100/comp.size
print("Accuracy in prediction of Wine Quality:", calculate_accuracy(wine_data), "%")
# Boxplot of every feature split by the (known) Quality label.
fig = plt.figure(figsize = (13,10))
i=1
for col in wine_data_X.columns:
    plt.subplot(2, 2, i)
    plt.title("Boxplot for Quality vs Column {}".format(col))
    sns.boxplot(data = wine_data , y=col, x='Quality')
    i=i+1
plt.show()
# Same boxplots, overlaid with the predicted clusters of the unlabelled rows.
fig = plt.figure(figsize = (13,10))
i=1
for col in wine_data_X.columns:
    plt.subplot(2, 2, i)
    plt.title("Boxplot for Quality vs Column {}".format(col))
    # Stripplot for the predicted values where data was missing.
    sns.stripplot(data = wine_data[wine_data['Quality'] == 'Quality NAN'] , y=col, x='Predicted_Quality',color='b')
    # BoxPlot for Quality A and B items.
    sns.boxplot(data = wine_data , y=col, x='Quality',order=['Quality B', 'Quality A'])
    i=i+1
plt.show()
====================================================================================================================
• DOMAIN: Automobile
• CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
• DATA DESCRIPTION: The data contains features extracted from the silhouette of vehicles in different angles. Four "Corgie" model vehicles were used for the experiment: a double decker bus, Cheverolet van, Saab 9000 and an Opel Manta 400 cars. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars.
• All the features are numeric i.e. geometric features extracted from the silhouette
• PROJECT OBJECTIVE: Apply dimensionality reduction technique – PCA and train a model using principal components instead of training the model using just the raw data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, recall_score
from sklearn.metrics import precision_score, f1_score
from scipy.stats import zscore
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# Load the vehicle silhouette dataset.
Vehicle = pd.read_csv('Part3 - vehicle.csv')
print('Vehicle Dataframe shape is : {}'.format(Vehicle.shape))
Vehicle.info()
1) Total number of rows in Vehicle dataset are 846
2) Total number of columns in Vehicle dataset are 19
3) 14 columns are with Float64 datatype
4) 4 columns are with Int64 datatype
5) 1 column is with Object dataype
6) We can see that columns have different non-null row counts, which means that there could be some missing values or the data type is not correct
# Checking the presence of any null values (per-column counts).
Vehicle.isnull().sum()
# Different method to check the null values; result dtype is bool per column.
Vehicle.isnull().any()
circularity - 5 Nulls
distance_circularity - 4 Nulls
radius_ratio - 6 Nulls
pr.axis_aspect_ratio - 2 Nulls
scatter_ratio - 1 Nulls
elongatedness - 1 Nulls
pr.axis_rectangularity - 3 Nulls
scaled_variance - 3 Nulls
scaled_variance.1 - 2 Nulls
scaled_radius_of_gyration - 2 Nulls
scaled_radius_of_gyration.1 - 4 Nulls
skewness_about - 6 Nulls
skewness_about.1 - 1 Nulls
skewness_about.2 - 1 Nulls
# Five-point summary of every column.
Vehicle.describe(include='all').T
compactness, max.length_aspect_ratio, max.length_rectangularity, hollows_ratio, class has no missing values.
Rest all features have missing values.
All features are of numerical types except class which is a target variable and has three unique values.
# Check for skewness of the columns if there is any.
Vehicle.skew()
Positively skewed: Most frequent values are low and tail is towards high values.
Negatively skewed: Most frequent values are high and tail is towards low values.
If Mode< Median< Mean then the distribution is positively skewed.
If Mode> Median> Mean then the distribution is negatively skewed.
compactness: Range of Q1 to Q3 is between 87 to 100. Normally distributed.
circularity: Range of Q1 to Q3 is 40 to 49. Normally distributed.
distance_circularity: Range of Q1 to Q3 is 70 to 98. Mean is slightly greater than median, we can say that the column is slightly skewed towards right.
radius_ratio: Range of Q1 to Q3 is 141 to 195. Mean is slightly greater than median, we can say that the column is slightly skewed towards right.
pr.axis_aspect_ratio: Range of Q1 to Q3 is 57 to 65. Mean is slightly greater than median, we can say that the column is slightly skewed towards right.
max.length_aspect_ratio: Range of Q1 to Q3 is 7 to 10. Mean is slightly greater than median, we can say that the column is slightly skewed towards right.
scatter_ratio: Range of Q1 to Q3 is 147 to 198. Mean is greater than median, we can say that the column is skewed towards right.
elongatedness: Range of Q1 to Q3 is 33 to 46. Mean is less than median, we can say that the column is skewed towards left.
pr.axis_rectangularity: Range of Q1 to Q3 is 19 to 23. Mean is greater than median, we can say that the column is skewed towards right.
max.length_rectangularity: Range of Q1 to Q3 is 137 to 159. Mean is slightly greater than median, we can say that the column is skewed towards right.
scaled_variance: Range of Q1 to Q3 is 167 to 217. Mean is greater than median, we can say that the column is skewed towards right.
scaled_variance.1: Range of Q1 to Q3 is 318 to 587. Mean is greater than median, we can say that the column is skewed towards right.
scaled_radius_of_gyration: Range of Q1 to Q3 is 149 to 198. Mean is greater than median, we can say that the column is skewed towards right.
scaled_radius_of_gyration.1: Range of Q1 to Q3 is 67 to 75. Mean is greater than median, we can say that the column is skewed towards right.
skewness_about: Range of Q1 to Q3 is 2 to 6. Mean is greater than median, skewed towards right.
skewness_about.1: Range of Q1 to Q3 is 5 to 19. Mean is greater than median, skewed towards right.
skewness_about.2: Range of Q1 to Q3 is 184 to 193. Mean is slightly greater than median, almost normally distributed.
hollows_ratio: Range of Q1 to Q3 is 197 to 211. Mean is less than median, skewed towards left.
class : 3 unique values, where car has the maximum frequency of 429
Below should be done as missing value treatment for the columns which have NULLs
circularity - 5 Nulls - Normal distribution - Mean should be used in place of missing values
distance_circularity - 4 Nulls - Positively skewed - Median should be used in place of missing values
radius_ratio - 6 Nulls - Positively skewed - Median should be used in place of missing values
pr.axis_aspect_ratio - 2 Nulls - Positively skewed - Median should be used in place of missing values
scatter_ratio - 1 Null - Positively skewed - Median should be used in place of missing values
elongatedness - 1 Null - Positively skewed - Median should be used in place of missing values
pr.axis_rectangularity - 3 Nulls - Positively skewed - Median should be used in place of missing values
scaled_variance - 3 Nulls - Positively skewed - Median should be used in place of missing values
scaled_variance.1 - 2 Nulls - Positively skewed - Median should be used in place of missing values
scaled_radius_of_gyration - 2 Nulls - Positively skewed - Median should be used in place of missing values
scaled_radius_of_gyration.1 - 4 Nulls - Positively skewed - Median should be used in place of missing values
skewness_about - 6 Nulls - Positively skewed - Median should be used in place of missing values
skewness_about.1 - 1 Null - Positively skewed - Median should be used in place of missing values
skewness_about.2 - 1 Null - Normal distribution - Mean should be used in place of missing values
# Keep a pre-imputation copy so before/after stats can be compared below.
Vehicle1 = Vehicle.copy()

# Impute per the distribution analysis above: mean for the (near-)normally
# distributed columns, median for the skewed ones. Data-driven loops replace
# the original's 14 near-identical fillna statements (same behavior, DRY).
mean_imputed_cols = ["circularity", "skewness_about.2"]
median_imputed_cols = [
    "distance_circularity", "radius_ratio", "pr.axis_aspect_ratio",
    "scatter_ratio", "elongatedness", "pr.axis_rectangularity",
    "scaled_variance", "scaled_variance.1", "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1", "skewness_about", "skewness_about.1",
]
for col in mean_imputed_cols:
    Vehicle[col] = Vehicle[col].fillna(Vehicle[col].mean())
for col in median_imputed_cols:
    Vehicle[col] = Vehicle[col].fillna(Vehicle[col].median())

Vehicle2 = Vehicle.copy()
print('Descriptive Stats before imputation for columns with missing values: \n', '--'*33)
display(Vehicle1.describe(include='all').T)
print()
print('Descriptive Stats after imputation for columns with missing values: \n', '--'*33)
display(Vehicle2.describe(include='all').T)
del Vehicle1, Vehicle2
An observation after imputing the missing values: medians and means remain unchanged after the imputation. The type of skewness also remains unchanged.
# Checking the presence of any null values after missing value treatment.
Vehicle.isnull().sum()
Vehicle.head()
Vehicle["class"].unique()
# Value counts for every non-numeric column (only 'class' here).
for col in Vehicle.columns:
    if Vehicle[col].dtype == 'int64' or Vehicle[col].dtype == 'float64':
        continue
    else:
        print("Column name : {}".format(col))
        print(Vehicle[col].value_counts())
        print()
# Encode the target categories as integers: car -> 1, bus -> 2, van -> 3.
replaceStruct = {"class": {"car": 1, "bus": 2, "van": 3}}
Vehicle = Vehicle.replace(replaceStruct)
Vehicle.head()
car represents about 50.7% of the total values in class variable, bus about 25.8% and van about 23.5%.
# IQR-based outlier capping: values beyond the whiskers are clamped to the
# whisker instead of being dropped, so no rows are lost.
Q1 = Vehicle.quantile(0.25)
Q3 = Vehicle.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
Vehicle_out_fixed = Vehicle.copy()
# Hoist the whiskers and the outlier masks out of the loops: the original
# recomputed Q1 - 1.5*IQR (and ran the full np.where scan twice) on every
# iteration. Behavior is unchanged.
Lower_whisker = Q1 - 1.5 * IQR
Upper_whisker = Q3 + 1.5 * IQR
# Replace every outlier on the lower side by the lower whisker.
low_rows, low_cols = np.where(Vehicle_out_fixed < Lower_whisker)
for i, j in zip(low_rows, low_cols):
    Vehicle_out_fixed.iloc[i,j] = Lower_whisker[j]
# Replace every outlier on the upper side by the upper whisker.
high_rows, high_cols = np.where(Vehicle_out_fixed > Upper_whisker)
for i, j in zip(high_rows, high_cols):
    Vehicle_out_fixed.iloc[i,j] = Upper_whisker[j]
Vehicle_out_fixed.shape, Vehicle.shape
print('Descriptive Stats before Outlier removal: \n', '--'*33)
display(Vehicle.describe(include='all').T)
print()
print('Descriptive Stats after Outlier removal: \n', '--'*33)
display(Vehicle_out_fixed.describe(include='all').T)
Vehicle_out_fixed.hist(figsize=(15,15));
1) Instead of removing the outliers, which might have resulted in loss of data, we have replaced the outliers (using IQR method) with Upper fence and Lower fence values. Total number of rows are 846 as per the original dataset
2) Again, there's least effect on Means and Median.
3) With the help of Histogram, We can see that outliers are removed
# Five-point summary after outlier treatment.
Vehicle_out_fixed.describe(include='all').T
### Details about the 5 point summary is already mentioned earlier for all the columns
# Skewness after outlier treatment.
Vehicle_out_fixed.skew()
Positively skewed: Most frequent values are low and tail is towards high values.
Negatively skewed: Most frequent values are high and tail is towards low values.
If Mode< Median< Mean then the distribution is positively skewed.
If Mode> Median> Mean then the distribution is negatively skewed.
compactness - Normally distributed.
circularity - Normally distributed.
distance_circularity - Right skewed
radius_ratio - Right skewed
pr.axis_aspect_ratio - Right skewed
max.length_aspect_ratio - Right skewed
scatter_ratio - Right skewed
elongatedness - Right skewed
pr.axis_rectangularity - Right skewed
max.length_rectangularity - Right skewed
scaled_variance - Right skewed
scaled_variance.1 - Right skewed
scaled_radius_of_gyration - Right skewed
scaled_radius_of_gyration.1 - Right skewed
skewness_about - Right skewed
skewness_about.1 - Right skewed
skewness_about.2 - Normally distributed.
hollows_ratio - Left skewed
# Correlation matrix and heatmap after outlier treatment.
Vehicle_out_fixed.corr()
fig = plt.figure(figsize = (15,12))
sns.heatmap(Vehicle_out_fixed.corr(),annot=True,linewidths=.05);
# Helper: display variable pairs whose |correlation| lies inside the
# half-open band [Lower_threshold, Upper_threshold), strongest first.
def correlation_matrix(df, Lower_threshold = 0.8, Upper_threshold = 1.0):
    """Display |correlation| pairs of ``df`` within the threshold band."""
    # One entry per pair: unstack the matrix and drop mirrored duplicates.
    ranked = df.corr().abs().unstack().drop_duplicates()
    ranked = ranked.sort_values(kind = "quicksort", ascending = False)
    in_band = (ranked >= Lower_threshold) & (ranked < Upper_threshold)
    display(ranked[in_band])
# Report correlated pairs in three strength bands using correlation_matrix.
print("Correlation between the pairs whose correlation is more than .90")
correlation_matrix(Vehicle_out_fixed, Lower_threshold = 0.9, Upper_threshold =1.0)
print("Correlation between the pairs whose correlation is beween .80 and .90")
correlation_matrix(Vehicle_out_fixed, Lower_threshold = 0.8, Upper_threshold =.9)
print("Correlation between the pairs whose correlation is beween .70 and .80")
correlation_matrix(Vehicle_out_fixed, Lower_threshold = 0.7, Upper_threshold =.8)
# Absolute correlation of each independent variable with the target 'class',
# displayed strongest first.
predictor_names = Vehicle_out_fixed.drop('class', axis = 1).columns
absCorrwithDep = [abs(Vehicle_out_fixed['class'].corr(Vehicle_out_fixed[name]))
                  for name in predictor_names]
corr_table = pd.DataFrame([predictor_names, absCorrwithDep],
                          index = ['Variable', 'Correlation']).T
display(corr_table.sort_values('Correlation', ascending = False))
1) scatter_ratio and scaled_variance.1, scatter_ratio and pr.axis_rectangularity, pr.axis_rectangularity and scaled_variance.1, scatter_ratio and elongatedness, circularityan and max.length_rectangularity, elongatedness and scaled_variance.1, scatter_ratio and scaled_variance, scaled_variance and scaled_variance.1, elongatedness and pr.axis_rectangularity, elongatedness and scaled_variance, pr.axis_rectangularity and scaled_variance, circularity and scaled_radius_of_gyration, distance_circularity and elongatedness, distance_circularity and scatter_ratio, scaled_radius_of_gyration.1 and hollows_ratio are correlated with each other with a correlation coeff greater than 0.9.
2) elongatedness, scatter_ratio, pr.axis_rectangularity, scaled_variance.1, distance_circularity and radius_ratio are some columns which have relatively strong correlation with the class variable
# Variance Inflation Factor (VIF) quantifies how much each column is
# explained by the others; large values flag multicollinearity.
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
# NOTE(review): VIF is computed over *all* columns, including the target
# 'class'; usually only predictors are included -- confirm intent.
vif["VIF Factor"] = [variance_inflation_factor(Vehicle_out_fixed.values, i) for i in range(Vehicle_out_fixed.shape[1])]
vif["features"] = Vehicle_out_fixed.columns
vif.round(1)
hollows_ratio, skewness_about.2, scatter_ratio, pr.axis_rectangularity, max.length_rectangularity
# Drop the five highest-VIF columns to reduce multicollinearity, then
# recompute VIF on the remaining features to confirm the improvement.
Vehicle_out_fixed.drop(['hollows_ratio', 'skewness_about.2', 'scatter_ratio','pr.axis_rectangularity',
'max.length_rectangularity'], axis = 1, inplace = True)
vif2 = pd.DataFrame()
vif2["VIF Factor"] = [variance_inflation_factor(Vehicle_out_fixed.values, i) for i in range(Vehicle_out_fixed.shape[1])]
vif2["features"] = Vehicle_out_fixed.columns
vif2
# pairplot of the Vehicle_out_fixed after dropping the columns
sns.pairplot(Vehicle_out_fixed, hue='class');
# Correlation heatmap after the drop: remaining pairwise correlations
# should now be noticeably weaker than before.
fig = plt.figure(figsize = (15,12))
sns.heatmap(Vehicle_out_fixed.corr(),annot=True,linewidths=.05);
# Univariate distribution plot of every column in Vehicle_out_fixed,
# one subplot per column.
fig = plt.figure(figsize = (16,20))
for position, column in enumerate(Vehicle_out_fixed.columns, start = 1):
    plt.subplot(5, 4, position)
    sns.distplot(Vehicle_out_fixed[column], color = 'r')
plt.show()
# Per-class boxplot of every column in Vehicle_out_fixed,
# one subplot per column.
fig = plt.figure(figsize = (16,15))
for position, column in enumerate(Vehicle_out_fixed.columns, start = 1):
    plt.subplot(4, 4, position)
    sns.boxplot(data = Vehicle_out_fixed, y = column, x = 'class')
plt.show()
# Per-class swarmplot of every column in Vehicle_out_fixed,
# one subplot per column.
fig = plt.figure(figsize = (18,20))
for position, column in enumerate(Vehicle_out_fixed.columns, start = 1):
    plt.subplot(5, 4, position)
    sns.swarmplot(data = Vehicle_out_fixed, y = column, x = 'class')
plt.show()
1) compactness of Class 3 is low in comparison to Class 1 and Class 2
2) max.length_aspect_ratio is low for Class 2 items
3) scaled_variance and scaled_variance.1 is low for class 3 when compared with class 2 and 1
# Class-balance check: counts of each vehicle class.
sns.countplot(Vehicle_out_fixed['class'])
# Plotting the Scatter plot
# Top row: three raw feature pairs; bottom row: the same pairs coloured by class.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 3, 1)
sns.scatterplot(Vehicle_out_fixed["elongatedness"],Vehicle_out_fixed["distance_circularity"])
plt.subplot(2, 3, 2)
sns.scatterplot(Vehicle_out_fixed["elongatedness"],Vehicle_out_fixed["scaled_variance"])
plt.subplot(2, 3, 3)
sns.scatterplot(Vehicle_out_fixed["scaled_radius_of_gyration"],Vehicle_out_fixed["circularity"])
plt.subplot(2, 3, 4)
sns.scatterplot(Vehicle_out_fixed["elongatedness"],Vehicle_out_fixed["distance_circularity"],hue=Vehicle_out_fixed["class"],palette='Set1')
plt.subplot(2, 3, 5)
sns.scatterplot(Vehicle_out_fixed["elongatedness"],Vehicle_out_fixed["scaled_variance"],hue=Vehicle_out_fixed["class"],palette='Set1')
plt.subplot(2, 3, 6)
sns.scatterplot(Vehicle_out_fixed["scaled_radius_of_gyration"],Vehicle_out_fixed["circularity"],hue=Vehicle_out_fixed["class"],palette='Set1')
plt.show()
1) scaled_variance and elongatedness are negatively correlated. For Class 2 items, when scaled_variance is high, elongatedness is less. For few Class 3 items, when elongatedness is between 42 to 47, scaled_variance is high.
2) circularity and scaled_radius_of_gyration are directly proportional and positively correlated.
# Vehicle_out_fixed = Vehicle_out_fixed_temp
# NOTE(review): this assignment creates an *alias*, not a copy --
# Vehicle_out_fixed_temp and Vehicle_out_fixed are the same DataFrame object.
Vehicle_out_fixed_temp = Vehicle_out_fixed
# Predictors
# Dropping the class column from predictors
X = Vehicle_out_fixed.drop('class',axis=1)
# Target
y = Vehicle_out_fixed['class']
X.info() # We can see that class Variable is dropped from the predictor dataframe
# Class proportions in the full dataset (baseline for the stratified split).
print('class = 1 in Vehicle_out_fixed_temp dataframe are {} %'.format(len(Vehicle_out_fixed_temp[Vehicle_out_fixed_temp['class'] == 1])/len(Vehicle_out_fixed_temp)*100))
print('class = 2 in Vehicle_out_fixed_temp dataframe are {} %'.format(len(Vehicle_out_fixed_temp[Vehicle_out_fixed_temp['class'] == 2])/len(Vehicle_out_fixed_temp)*100))
print('class = 3 in Vehicle_out_fixed_temp dataframe are {} %'.format(len(Vehicle_out_fixed_temp[Vehicle_out_fixed_temp['class'] == 3])/len(Vehicle_out_fixed_temp)*100))
# Standardise every predictor column to zero mean / unit variance (z-score).
XScaled=X.apply(zscore)
XScaled.head()
# Split X and y into training and test set in 70:30 ratio
# Stratify is used to make sure that data is classified proportionally basis on the class variable
# NOTE(review): no random_state is set, so the split (and every downstream
# score) changes between runs -- consider fixing a seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.30, stratify=y)
# NOTE(review): the 'Tel_Customer_Churn' wording in the messages below is a
# leftover from another project; the data printed is the vehicle dataframe.
print('class = 1 in original Tel_Customer_Churn dataframe are {} %'.format(len(Vehicle_out_fixed_temp[Vehicle_out_fixed_temp['class'] == 1])/len(Vehicle_out_fixed_temp)*100))
print('class = 2 in original Tel_Customer_Churn dataframe are {} %'.format(len(Vehicle_out_fixed_temp[Vehicle_out_fixed_temp['class'] == 2])/len(Vehicle_out_fixed_temp)*100))
print('class = 3 in original Tel_Customer_Churn dataframe are {} %'.format(len(Vehicle_out_fixed_temp[Vehicle_out_fixed_temp['class'] == 3])/len(Vehicle_out_fixed_temp)*100))
# Verify stratification: train and test proportions should match the baseline.
print('class = 1 in Train dataframe are {} %'.format(len(y_train[y_train == 1])/len(y_train)*100))
print('class = 2 in Train dataframe are {} %'.format(len(y_train[y_train == 2])/len(y_train)*100))
print('class = 3 in Train dataframe are {} %'.format(len(y_train[y_train == 3])/len(y_train)*100))
print('class = 1 in Test dataframe are {} %'.format(len(y_test[y_test == 1])/len(y_test)*100))
print('class = 2 in Test dataframe are {} %'.format(len(y_test[y_test == 2])/len(y_test)*100))
print('class = 3 in Test dataframe are {} %'.format(len(y_test[y_test == 3])/len(y_test)*100))
# SVC with hyperparameter tuning -- Original Features
random_state = 42
scoring = 'f1_macro'
svc = SVC(random_state = random_state)
# Grid over the regularisation strength C and the kernel type.
params = {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['linear', 'rbf']}
skf = StratifiedKFold(n_splits = 10)
# NOTE(review): `scoring` is defined above but never passed to GridSearchCV,
# so the grid search optimises the estimator default (accuracy) while the
# cross_val_score below uses f1_macro -- confirm which was intended.
grid_svc_f = GridSearchCV(svc, param_grid = params, n_jobs = -1, cv = skf)
# Fit the model
grid_svc_f.fit(X_train, y_train)
y_true, y_pred = y_test, grid_svc_f.predict(X_test)
print('SVC Scores with Hyperparameter Tuning for original features\n')
print('Best Hyper Parameters are: ', grid_svc_f.best_params_)
print('Best Score is: ', grid_svc_f.best_score_.round(3))
print('SVC accuracy for train set: {0:.3f}'.format(grid_svc_f.score(X_train, y_train)))
print('SVC accuracy for test set: {0:.3f}'.format(grid_svc_f.score(X_test, y_test)))
# Cross Validation Score
grid_svc_f_score = cross_val_score(grid_svc_f, X_train, y_train, cv = skf, scoring = scoring)
print('SVC cross validation training score: ', round(grid_svc_f_score.mean(), 3).astype(str))
# Classification Report
print('\n{}'.format(classification_report(y_true, y_pred)))
# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print('\nConfusion Matrix:\n', cm)
# Keep the scores for the final comparison table.
SVC_Original_Train_Accuracy = grid_svc_f.score(X_train, y_train)
SVC_Original_Test_Accuracy = grid_svc_f.score(X_test, y_test)
SVC_Original_Test_cross_val_score = round(grid_svc_f_score.mean(), 3).astype(str)
# NOTE(review): with micro averaging on single-label multiclass data,
# recall, precision and F1 all equal the overall accuracy; 'macro' may be
# more informative here.
SVC_Original_Test_Recall = recall_score(y_test,y_pred, average='micro')
SVC_Original_Test_precision = precision_score(y_test,y_pred, average='micro')
SVC_Original_Test_f1 = f1_score(y_test,y_pred, average='micro')
from sklearn import metrics
print("Confusion Matrix - SVC Model Original")
# Rows: true class 1..3; columns: predicted class 1..3.
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1,2,3])
df_cm = pd.DataFrame(cm, index = [i for i in [1,2,3]],
columns = [i for i in ["Predict 1","Predict 2","Predict 3"]])
plt.figure(figsize = (5,4))
sns.heatmap(df_cm, annot=True, fmt='g');
# del resultsDf
#Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.DataFrame({'Method':['SVC - Original'], 'Train Accuracy': SVC_Original_Train_Accuracy,
'Test Accuracy': SVC_Original_Test_Accuracy, 'Recall': SVC_Original_Test_Recall,
'Precision': SVC_Original_Test_precision,'F1 Score': SVC_Original_Test_f1,
'Cross validation score': SVC_Original_Test_cross_val_score})
# Fix the column order and index before display.
resultsDf = resultsDf[['Method', 'Train Accuracy', 'Test Accuracy', 'Recall','Precision','F1 Score','Cross validation score']]
resultsDf.reset_index(drop=True, inplace=True)
resultsDf
# Covariance matrix of the standardised training predictors.
covMatrix = np.cov(X_train,rowvar=False)
print(covMatrix)
# Fit PCA keeping one component per feature to inspect the full variance
# spectrum before choosing how many components to retain.
pca = PCA(n_components=13)
pca.fit(X_train)
print(pca.explained_variance_)
print(pca.components_)
print(pca.explained_variance_ratio_)
# Scree plot: variance explained by each individual component.
plt.bar(list(range(1,14)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative variance: used to pick the cut-off for ~95% explained variance.
plt.step(list(range(1,14)),np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
1) We can see that there is a variance explained with increase in number of PC's.
2) We will proceed with 6 components here which covers more than 95% of variance.
# Re-fit PCA keeping the 6 components that explain ~95% of the variance.
pca2 = PCA(n_components=6)
pca2.fit(X_train)
print(pca2.components_)
print(pca2.explained_variance_ratio_)
Xpca2_train = pca2.transform(X_train)
Xpca2_test = pca2.transform(X_test)
pd.DataFrame(Xpca2_train)
pd.DataFrame(y_train)
sns.pairplot(pd.DataFrame(Xpca2_train))
# Creating a dimension reduced dataframe with predictors and target.
# Bug fix: pd.DataFrame(Xpca2_train) has a fresh RangeIndex (0..n-1) while
# y_train keeps the shuffled index of the original dataframe, so the original
# index-based join misaligned labels and produced NaNs. Resetting y's index
# makes the join positional, pairing each PCA row with its own label.
Vehicle_reduced_feature_train = pd.DataFrame(Xpca2_train).join(
    pd.DataFrame(y_train.reset_index(drop = True), columns = ['class']), how = 'left', sort = False)
Vehicle_reduced_feature_test = pd.DataFrame(Xpca2_test).join(
    pd.DataFrame(y_test.reset_index(drop = True), columns = ['class']), how = 'left', sort = False)
Vehicle_reduced_feature_train.shape, Vehicle_reduced_feature_test.shape
# Plotting the Scatter plot for all the vraibales in the Vehicle_reduced_feature_train dataset
# Consecutive principal-component pairs, coloured by class, to check how well
# the reduced space separates the classes.
fig = plt.figure(figsize = (15,15))
targets = [3, 2, 1]; colors = ['r', 'g', 'b']
plt.subplot(3, 3, 1)
sns.scatterplot(Vehicle_reduced_feature_train[0],Vehicle_reduced_feature_train[1],hue=Vehicle_reduced_feature_train['class'],palette='Set1')
plt.title("Scatter Plot of first and second column")
plt.subplot(3, 3, 2)
sns.scatterplot(Vehicle_reduced_feature_train[1],Vehicle_reduced_feature_train[2],hue=Vehicle_reduced_feature_train['class'],palette='Set1')
plt.title("Scatter Plot of Second and Third column")
plt.subplot(3, 3, 3)
sns.scatterplot(Vehicle_reduced_feature_train[2],Vehicle_reduced_feature_train[3],hue=Vehicle_reduced_feature_train['class'],palette='Set1')
plt.title("Scatter Plot of Third and fourth column")
plt.subplot(3, 3, 4)
sns.scatterplot(Vehicle_reduced_feature_train[3],Vehicle_reduced_feature_train[4],hue=Vehicle_reduced_feature_train['class'],palette='Set1')
plt.title("Scatter Plot of fourth and Fifth column")
plt.subplot(3, 3, 5)
sns.scatterplot(Vehicle_reduced_feature_train[4],Vehicle_reduced_feature_train[5],hue=Vehicle_reduced_feature_train['class'],palette='Set1')
plt.title("Scatter Plot of Fifth and Sixth column")
plt.subplot(3, 3, 6)
sns.scatterplot(Vehicle_reduced_feature_train[5],Vehicle_reduced_feature_train[0],hue=Vehicle_reduced_feature_train['class'],palette='Set1')
plt.title("Scatter Plot of Sixth and First column")
plt.show()
### kdeplot of every principal component, one density curve per vehicle class
fig = plt.figure(figsize = (15,8))
fig.suptitle('Distribution for Car, Bus, Van for Principal Components', fontsize = 14)
# Slice the training frame once per class label.
by_class = {label: Vehicle_reduced_feature_train[Vehicle_reduced_feature_train['class'] == label]
            for label in (1, 2, 3)}
pc_columns = [c for c in Vehicle_reduced_feature_train.columns if c != 'class']
for position, column in enumerate(pc_columns, start = 1):
    plt.subplot(2, 3, position)
    sns.kdeplot(by_class[1][column], bw = 0.5, label = 'Car')
    sns.kdeplot(by_class[2][column], bw = 0.5, label = 'Bus')
    sns.kdeplot(by_class[3][column], bw = 0.5, label = 'Van')
plt.show()
### kdeplot comparing train vs test distribution of each principal component
fig = plt.figure(figsize = (15,8))
fig.suptitle('Most of the principal components are normally distributed in both train and test set', fontsize = 20)
train_pcs = Vehicle_reduced_feature_train.drop(['class'], axis = 1)
test_pcs = Vehicle_reduced_feature_test.drop(['class'], axis = 1)
pc_columns = [c for c in Vehicle_reduced_feature_train.columns if c != 'class']
for position, column in enumerate(pc_columns, start = 1):
    plt.subplot(2, 3, position)
    sns.kdeplot(train_pcs[column], bw = 0.5, label = 'Train')
    sns.kdeplot(test_pcs[column], bw = 0.5, label = 'Test')
plt.show()
# SVC with hyperparameter tuning -- reduced Features
# Same grid and CV setup as the original-features run, applied to the
# 6-component PCA projection so the two models are directly comparable.
random_state = 42
scoring = 'f1_macro'
svc = SVC(random_state = random_state)
params = {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['linear', 'rbf']}
skf = StratifiedKFold(n_splits = 10)
# NOTE(review): as in the original-features cell, `scoring` is not passed
# to GridSearchCV, so the search optimises the default metric.
grid_svc_R = GridSearchCV(svc, param_grid = params, n_jobs = -1, cv = skf)
# Fit the model
grid_svc_R.fit(Xpca2_train, y_train)
y_true, y_pred = y_test, grid_svc_R.predict(Xpca2_test)
print('SVC Scores with Hyperparameter Tuning for Redcued features\n')
print('Best Hyper Parameters for reduced are: ', grid_svc_R.best_params_)
print('Best Score is: ', grid_svc_R.best_score_.round(3))
print('SVC accuracy for train reduced set: {0:.3f}'.format(grid_svc_R.score(Xpca2_train, y_train)))
print('SVC accuracy for test reduced set: {0:.3f}'.format(grid_svc_R.score(Xpca2_test, y_test)))
# Cross Validation Score
grid_svc_R_score = cross_val_score(grid_svc_R, Xpca2_train, y_train, cv = skf, scoring = scoring)
print('SVC cross validation training score: ', round(grid_svc_R_score.mean(), 3).astype(str))
# Classification Report
print('\n{}'.format(classification_report(y_true, y_pred)))
# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print('\nConfusion Matrix:\n', cm)
# Keep the scores for the final comparison table.
SVC_Reduced_Train_Accuracy = grid_svc_R.score(Xpca2_train, y_train)
SVC_Reduced_Test_Accuracy = grid_svc_R.score(Xpca2_test, y_test)
SVC_Reduced_Test_cross_val_score = round(grid_svc_R_score.mean(), 3).astype(str)
SVC_Reduced_Test_Recall = recall_score(y_test,y_pred, average='micro')
SVC_Reduced_Test_precision = precision_score(y_test,y_pred, average='micro')
SVC_Reduced_Test_f1 = f1_score(y_test,y_pred, average='micro')
from sklearn import metrics
# Bug fix: this confusion matrix belongs to the PCA-reduced model, but the
# title said "Original" (copy-paste from the earlier cell).
print("Confusion Matrix - SVC Model Reduced")
# Rows: true class 1..3; columns: predicted class 1..3.
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1,2,3])
df_cm = pd.DataFrame(cm, index = [1, 2, 3],
                     columns = ["Predict 1", "Predict 2", "Predict 3"])
plt.figure(figsize = (5,4))
sns.heatmap(df_cm, annot=True, fmt='g');
# del tempResultsDf
#Store the accuracy results for each model in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Method':['SVC - Reduced'], 'Train Accuracy': SVC_Reduced_Train_Accuracy,
'Test Accuracy': SVC_Reduced_Test_Accuracy, 'Recall': SVC_Reduced_Test_Recall,
'Precision': SVC_Reduced_Test_precision,'F1 Score': SVC_Reduced_Test_f1,
'Cross validation score': SVC_Reduced_Test_cross_val_score})
# Append the reduced-features row to the comparison table.
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Train Accuracy', 'Test Accuracy', 'Recall','Precision','F1 Score','Cross validation score']]
resultsDf.reset_index(drop=True, inplace=True)
resultsDf
# Helper function to plot a learning curve (training vs cross-validation
# score as a function of training-set size) on a supplied axes object.
def plot_learning_curve(estimator, X, y, ax, ylim = None, cv = None, n_jobs = 1,
                        train_sizes = np.linspace(.1, 1.0, 5), name = ' '):
    """Draw the learning curve of `estimator` on axes `ax`.

    Parameters: estimator (fitted or unfitted sklearn estimator), X/y (data),
    ax (matplotlib Axes to draw on), ylim (optional (low, high) y-limits),
    cv (cross-validation splitter), n_jobs (parallelism for learning_curve),
    train_sizes (fractions of the training set to evaluate), name (plot title).
    """
    if ylim is not None:
        # Bug fix: apply the limit to the target axes `ax`; the original
        # called plt.ylim(), which acts on the *current* axes and is wrong
        # when several subplots are drawn in one figure.
        ax.set_ylim(*ylim)
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv = cv, n_jobs = n_jobs,
                                                            train_sizes = train_sizes)
    train_scores_mean = np.mean(train_scores, axis = 1)
    train_scores_std = np.std(train_scores, axis = 1)
    test_scores_mean = np.mean(test_scores, axis = 1)
    test_scores_std = np.std(test_scores, axis = 1)
    # Shaded bands show +/- one standard deviation across CV folds.
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                    alpha = 0.1, color = '#ff9124')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                    alpha = 0.1, color = '#2492ff')
    ax.plot(train_sizes, train_scores_mean, 'o-', color = '#ff9124', label = 'Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color ='#2492ff', label = 'Cross-validation score')
    ax.set_title(name, fontsize = 14)
    ax.set_xlabel('Training size')
    ax.set_ylabel('Score')
    ax.grid(True)
    ax.legend(loc = 'best')
# Plot training vs cross validation scores
# Side-by-side learning curves: PCA components (left) vs original features
# (right), using 30-fold stratified CV.
cv = StratifiedKFold(n_splits = 30)
f, ((ax1, ax2)) = plt.subplots(1, 2, figsize = (15, 7.2))
f.suptitle('Training vs Cross Validation Scores', fontsize = 14)
plot_learning_curve(grid_svc_R, Xpca2_train, y_train, cv = cv, n_jobs = 1, ax = ax1,
name = 'Support Vector Classifier \n Principal Components Learning Curve')
plot_learning_curve(grid_svc_f, X_train, y_train, cv = cv, n_jobs = 1, ax = ax2,
name = 'Support Vector Classifier \n Original Features Learning Curve')
# Final comparison table of both models.
resultsDf
We used correlation matrix and checked the relation of each feature with the class column to reduce the number of features in the dataset to 12 from 18.
PCA being a statistical technique to reduce the dimensionality of the data by the selecting the most important features that captures maximum information about the dataset, does the task here. Here we've reduced the dimension from 12 to 6 and selected those which explained 95% variance. Doing that it removes the correlated features as well, which we saw in the scatterplot before and after PCA.
However, some of the limitations which are clearly seen in this use case are: after implementing PCA on the dataset, we saw features getting converted into principal components. Principal components are the linear combination of original features. This makes the features less interpretable. Additionally, we know that one of limitation of PCA is it assumes linearity i.e. principal components are a linear combinations of the original features, which if not true will not give a sensible results.
We have applied Support Vector Classifier on the Original feature data and reduced feature data (PCA) and below are the analysis details-
Training Accuracy score of 97.1 % (Original) and 93.0 % (Reduced)
Testing Accuracy score of 89.3 % (Original) and 82.6 % (Reduced)
Cross validation score of 94.7 % (Original) and 90.8 % (Reduced)
Recall, precision and F1 Score with the original dataset are 89.3 %, whereas with reduced features they are 82.6 %, respectively
From the above we can see that Accuracy and Cross validation score are better for the original features than when SVC is applied on the reduced features (PCA).
============================================================================================================================
Here the Shape of dataset we were dealing with was 846 rows and 12 features + 1 class column. Effect of PCA can be more useful in large datasets with more features
Based on the learning curve, we can conclude that for SVC with both principal components and original features, training and validation scores increase with the size of the dataset, which would mean the scores can further increase with more training samples. However, the gap between training and validation score for SVC with principal components is higher than for the original feature dataset.
====================================================================================================================
• DOMAIN: Sports management
• CONTEXT: Company X is a sports management company for international cricket. The data concerns IPL batsmen, whose performance attributes are to be used to build a data-driven ranking model
• DATA DESCRIPTION: The data is collected belongs to batsman from IPL series conducted so far.
Attribute Information:
Runs: Runs score by the batsman
Ave: Average runs scored by the batsman per match
SR: strike rate of the batsman
Fours: number of boundary/four scored
Six: number of boundary/six scored
HF: number of half centuries scored so far
• PROJECT OBJECTIVE: Goal is to build a data driven batsman ranking model for the sports management company to make business decisions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import zscore
from scipy import stats
from sklearn.decomposition import PCA
# Load the IPL batsman statistics and inspect dtypes / null counts.
players = pd.read_csv("Part4 - batting_bowling_ipl_bat.csv")
print('players Dataframe shape is : {}'.format(players.shape))
players.info()
1) Total number of rows in the players dataset are 90
2) Total number of columns in the players dataset are 7
3) 6 columns are with Float64 datatype
4) 1 column is with Object datatype
5) All the column values as per the info command are non-null, but the RangeIndex shows 180 entries, i.e. half the rows are entirely empty
players.head()
# Rows where the player name is missing (the file contains empty rows).
players[players['Name'].isnull()]
# Drop the rows which have NULL values in every column
players_new = players.dropna(how='all')
players_new = players_new.reset_index(drop=True)
players_new.head()
players_new.describe(include='all').T
fig = plt.figure(figsize = (8,6))
sns.heatmap(players_new.corr(),annot=True,linewidths=.05);
sns.pairplot(players_new, diag_kind='kde')
# try plotting distribution graphs.
players_new.hist(figsize=(15,8),layout = (2,3));
# Drop the non-numeric 'Name' column, then draw a univariate boxplot for
# every remaining numeric column.
players_new1 = players_new.copy()
players_new1 = players_new1.drop(['Name'],axis=1)
fig = plt.figure(figsize = (12,8))
for position, column in enumerate(players_new1.columns, start = 1):
    plt.subplot(2, 3, position)
    sns.boxplot(data = players_new1, y = column)
plt.show()
# Univariate swarmplot of every numeric column in players_new1.
fig = plt.figure(figsize = (12,8))
for position, column in enumerate(players_new1.columns, start = 1):
    plt.subplot(2, 3, position)
    sns.swarmplot(data = players_new1, y = column)
plt.show()
# Standardise every numeric column (z-score) before clustering.
players_data_scaled = players_new1.apply(zscore)
players_data_scaled
# Trying the K-means clusters with K=3
# NOTE(review): KMeans has no random_state, so cluster labels / assignments
# can differ between runs.
players_data_model = KMeans(3)
players_data_model.fit(players_data_scaled)
players_group = players_data_model.predict(players_data_scaled)
# Attach the cluster label to both the scaled and the original dataframes.
players_data_scaled['Predicted_group'] = players_group
players_new['Predicted_group'] = players_group
# Per-cluster boxplots and means to characterise each group.
players_data_scaled_grouped = players_data_scaled.groupby(['Predicted_group'])
players_data_scaled_grouped.boxplot(figsize=(14,10),layout = (2,2));
players_data_scaled_grouped.mean()
players_data_scaled.head()
players_new.head()
# Full PCA on the six scaled batting attributes (cluster label excluded)
# to see how much variance the first component captures.
pca = PCA()
pca.fit(players_data_scaled.drop('Predicted_group', axis=1))
plt.bar(list(range(1,7)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Project the players onto the first principal component and use that single
# score as the ranking criterion.
pca2 = PCA(n_components=1)
pca2.fit(players_data_scaled.drop('Predicted_group', axis=1))
print('PCA components are {}'.format(pca2.components_))
print('Variance ratio : {}'.format(pca2.explained_variance_ratio_))
Xpca2 = pca2.transform(players_data_scaled.drop('Predicted_group', axis=1))
players_new['X_PCA'] = Xpca2
players_data_scaled['X_PCA'] = Xpca2
# NOTE(review): ranking assumes a larger first-PC score means a better
# batsman; the sign of a principal component is arbitrary, so verify that
# the component weights printed above are positive for the performance
# attributes before trusting the order.
players_data_sorted = players_new.sort_values(by='X_PCA', ascending=False)
players_data_sorted['Ranking'] = list(range(1,91))
players_data_scaled = players_data_scaled.sort_values(by='X_PCA', ascending=False)
players_data_scaled['Ranking'] = list(range(1,91))
players_data_sorted
players_data_scaled.head()
# Fit a linear regression predicting the PCA-derived ranking from the
# scaled batting attributes.
X=players_data_scaled.drop(['Predicted_group', 'X_PCA', 'Ranking'], axis=1)
y=players_data_scaled['Ranking']
regression_model = LinearRegression()
regression_model.fit(X,y)
# NOTE(review): score() reports R^2 here, not classification accuracy; and
# the target 'Ranking' was itself derived from these same features via PCA,
# so a high score is expected and is not independent validation.
print('Accuracy of the model is {}'.format(regression_model.score(X, y)))
### As per EDA also we can see that rankings are fine basis on the Runs, Half century and Average
# Successively filter the ranked table by runs, boundaries and half-centuries
# as a sanity check that top-ranked players dominate these raw stats.
players_data_sorted_EDA = players_data_sorted[players_data_sorted['Runs'] > 400]
players_data_sorted_EDA
players_data_sorted_EDA = players_data_sorted_EDA[players_data_sorted_EDA['Fours'] >= 40]
players_data_sorted_EDA
players_data_sorted_EDA = players_data_sorted_EDA[players_data_sorted_EDA['HF'] >= 5]
players_data_sorted_EDA
===========================================================================================================
Missing Value Ratio: If the dataset has too many missing values, we use this approach to reduce the number of variables. We can drop the variables having a large number of missing values in them
Low Variance filter: We apply this approach to identify and drop constant variables from the dataset. The target variable is not unduly affected by variables with low variance, and hence these variables can be safely dropped
High Correlation filter: A pair of variables having high correlation increases multicollinearity in the dataset. So, we can use this technique to find highly correlated features and drop them accordingly
Random Forest: This is one of the most commonly used techniques which tells us the importance of each feature present in the dataset. We can find the importance of each feature and keep the top most features, resulting in dimensionality reduction Both Backward Feature Elimination and Forward Feature Selection techniques take a lot of computational time and are thus generally used on smaller datasets
Factor Analysis: This technique is best suited for situations where we have highly correlated set of variables. It divides the variables based on their correlation into different groups, and represents each group with a factor
Principal Component Analysis: This is one of the most widely used techniques for dealing with linear data. It divides the data into a set of components which try to explain as much variance as possible
Singular Value Decomposition: The purpose of Singular Value Decomposition is to simplify a matrix and make doing calculations with the matrix easier. The matrix is reduced to its constituent parts, similar to the goal of PCA.
Exapmple of SVD: One of the most common ways that SVD is used is to compress images
Linear Discriminant Analysis: Linear Discriminant Analysis operates by projecting data from a multidimensional graph onto a linear graph. The easiest way to conceive of this is with a graph filled up with data points of two different classes. Assuming that there is no line that will neatly separate the data into two classes, the two dimensional graph can be reduced down into a 1D graph. This 1D graph can then be used to hopefully achieve the best possible separation of the data points.
When LDA is carried out there are two primary goals: minimizing the variance of the two classes and maximizing the distance between the means of the two data classes.
Example of LDA: LDA can be used as a classification algorithm in addition to carrying out dimensionality reduction.
Independent Component Analysis: We can use ICA to transform the data into independent components which describe the data using less number of components
ISOMAP: We use this technique when the data is strongly non-linear
t-SNE: This technique also works well when the data is strongly non-linear. It works extremely well for visualizations as well
UMAP: This technique works well for high dimensional data. Its run-time is shorter as compared to t-SNE
One of the most common ways that SVD is used is to compress images. After all, the pixel values that make up the red, green, and blue channels in the image can just be reduced and the result will be an image that is less complex but still contains the same image content. Let's try using SVD to compress an image and render it.
import numpy
import PIL
from PIL import Image
function to load the image and turn it into a Numpy array. We then want to select the red, green, and blue color channels from the image
def load_image(image):
    """Open an image file and return its red, green and blue channel arrays."""
    pixels = numpy.array(Image.open(image))
    # A PIL RGB array is laid out as (row, col, [R, G, B]).
    return pixels[:, :, 0], pixels[:, :, 1], pixels[:, :, 2]
Now that we have the colors, we need to compress the color channels. We can start by calling Numpy's SVD function on the color channel we want. We'll then create an array of zeroes that we'll fill in after the matrix multiplication is completed. We then specify the singular value limit we want to use when doing the calculations:
def channel_compress(color_channel, singular_value_limit):
    """Compress one color channel via a truncated-SVD reconstruction.

    Parameters
    ----------
    color_channel : 2-D array of pixel intensities for one channel.
    singular_value_limit : number of leading singular values to keep.

    Returns
    -------
    2-D uint8 array of the same shape, reconstructed from the kept values.
    """
    u, s, v = numpy.linalg.svd(color_channel)
    n = singular_value_limit
    # Scale the kept left singular vectors by their singular values directly.
    # The original built a full diagonal matrix (numpy.diag(s)) first, which
    # is O(m^2) memory, and also pre-allocated a `zeros` array that was
    # immediately overwritten (dead code) -- both removed.
    left_matrix = u[:, :n] * s[:n]
    inner_compressed = numpy.matmul(left_matrix, v[:n, :])
    return inner_compressed.astype('uint8')
# path=r'C:\Users\Anuj Sachdeva\Desktop\Great Learning\06 Unsupervised Learning\Project\dog3.jpg'
# Load the demo image and choose how many singular values to keep.
red, green, blue = load_image('Tree.jpeg')
singular_val_lim = 350
After this, we do matrix multiplication on the diagonal and the value limits in the U matrix, as described above. This gets us the left matrix and we then multiply it with the V matrix. This should get us the compressed values which we transform to the ‘uint8' type:
def compress_image(red, green, blue, singular_val_lim):
    """Compress each RGB channel via truncated SVD, then show and save the
    recombined image as 'Tree-edited.jpeg'."""
    channels = [channel_compress(channel, singular_val_lim)
                for channel in (red, green, blue)]
    planes = tuple(Image.fromarray(channel) for channel in channels)
    new_image = Image.merge("RGB", planes)
    new_image.show()
    new_image.save("Tree-edited.jpeg")
# Run the SVD compression on all three channels and save the result.
compress_image(red, green, blue, singular_val_lim)
Attaching Input File 'Tree.jpeg' - 202 kb
Attaching Output Compressed File 'Tree-edited.jpeg'- 105 kb
===========================================================================================================================